library(rhdf5)
library(dplyr)
library(tidyr)
library(caret)
library(corrplot)
library(rpart)
library(ggplot2)

Introduction to this project (the Two Sigma Financial Modeling Challenge from Kaggle)

Download the file

The data source is supplied by Kaggle: https://www.kaggle.com/c/two-sigma-financial-modeling/downloads/train.h5.zip

# Set up a project directory under ~/Documents and fetch the Kaggle archive.
# NOTE(review): rm(list = ls()) and setwd() are discouraged in scripts; they
# are kept here only because the rest of this script relies on this working
# directory being set.
rm(list=ls())
setwd("~/Documents")
if (!dir.exists("KaggleProject")) dir.create("KaggleProject")
fileUrl <- "https://www.kaggle.com/c/two-sigma-financial-modeling/downloads/train.h5.zip"
destZip <- file.path("KaggleProject", "train.h5.zip")
# Skip the (large) download if the archive is already on disk.
if (!file.exists(destZip)) {
        download.file(fileUrl, destfile = destZip)
}
unzip(destZip, exdir = "./KaggleProject")

Quick glance of the h5 file:

# The archive was unzipped into ./KaggleProject, so list the file from there.
# (The original called h5ls("train.h5"), which points at ~/Documents where
# the file does not exist.)
h5ls("./KaggleProject/train.h5")
##    group          name       otype  dclass           dim
## 0      /         train   H5I_GROUP                      
## 1 /train         axis0 H5I_DATASET  STRING           111
## 2 /train         axis1 H5I_DATASET INTEGER       1710756
## 3 /train  block0_items H5I_DATASET  STRING             2
## 4 /train block0_values H5I_DATASET INTEGER   2 x 1710756
## 5 /train  block1_items H5I_DATASET  STRING           109
## 6 /train block1_values H5I_DATASET   FLOAT 109 x 1710756

From this we can observe that block1_items contains the name of each variable, while block1_values contains the actual values we want.

Read the dataset

# Read the variable names (block1_items) and the value matrix (block1_values)
# from the HDF5 file that was unzipped into ./KaggleProject. (The original
# read "train.h5" from the working directory, where the file does not exist.)
block1_items  <- h5read("./KaggleProject/train.h5", name = "train/block1_items")
block1_values <- h5read("./KaggleProject/train.h5", name = "train/block1_values")

# Extract the data set "Train": the values are stored 109 x 1710756 (one row
# per variable), so transpose to get one row per observation and one column
# per variable, then label the columns with the variable names.
Train <- t(as.matrix(block1_values))
colnames(Train) <- block1_items
Train <- as.data.frame(Train)
dim(Train)
## [1] 1710756     109

Clean the data

Since the raw data set contains NA and NaN values, it is necessary to remove them before the real analysis.

# Remove the observations that contain NA or NaN values.
# complete.cases() is the idiomatic row-wise completeness test; the original
# apply(Train, 1, ...) coerces the whole data frame to a matrix on every
# call, which is both slower and type-unsafe. is.na() is TRUE for NaN as
# well, so the result is identical.
good <- complete.cases(Train)
Train <- Train[good, ]
dim(Train)
## [1] 223040    109

Check the correlation among the variables and remove those highly correlated with others, in order to save time in the subsequent modeling.

# Check the correlation between variables on a random subsample.
# png(filename = "FinancialCh.png", width = 4800,height = 4800,units = "px")
# NOTE(review): the original sampled only 50 rows, which cannot yield a
# stable estimate of a ~109-variable correlation matrix (rank <= 49 and very
# noisy). A larger subsample keeps the computation cheap while making the
# pairwise correlations far more reliable.
rrow <- sample(nrow(Train), 5000)
corGraph <- cor(Train[rrow, ])
corrplot(corGraph, order = "FPC", method = "number", type = "lower",
         tl.cex = 0.8, tl.col = rgb(0, 0, 0), number.cex = 0.7,
         number.digits = 2)

# dev.off()

According to the plot, each of the following variables is highly correlated with other variables already in the data set, so we remove them. The list (with the duplicate entries of the original list removed) is: fundamental_1, fundamental_10, fundamental_11, fundamental_12, fundamental_15, fundamental_16, fundamental_26, fundamental_30, fundamental_34, fundamental_41, fundamental_42, fundamental_43, fundamental_45, fundamental_48, fundamental_51, fundamental_53, fundamental_55, fundamental_56, fundamental_57, fundamental_60, fundamental_61

# Drop the highly correlated columns identified from the correlation plot.
# The original removal list contained duplicates (fundamental_10, _11, _26,
# _30, _42 and _53 each appeared twice); they are harmless to select(-...)
# but are removed here for clarity. 109 - 21 unique columns = 88.
drop_cols <- c(
        "fundamental_1",  "fundamental_10", "fundamental_11", "fundamental_12",
        "fundamental_15", "fundamental_16", "fundamental_26", "fundamental_30",
        "fundamental_34", "fundamental_41", "fundamental_42", "fundamental_43",
        "fundamental_45", "fundamental_48", "fundamental_51", "fundamental_53",
        "fundamental_55", "fundamental_56", "fundamental_57", "fundamental_60",
        "fundamental_61"
)
Train <- Train %>%
        select(-all_of(drop_cols))
dim(Train)
## [1] 223040     88

Initial linear model by lm() function

set.seed(2017-01-10)
# 70/30 split of the cleaned data.
inTrain <- createDataPartition(Train$y, p = 0.7, list = FALSE)
trainSet <- Train[inTrain, ]
testSet <- Train[-inTrain, ]
# Initial linear model using all remaining predictors.
ini.fit <- lm(y ~ ., data = trainSet)

# Predict y on the test set to compare with the observed y.
# Use the generic predict() rather than calling the predict.lm method
# directly; dispatch picks the right method for an lm object anyway.
pred_ini.fit <- predict(ini.fit, newdata = testSet)

# Out-of-sample R value: signed square root of R^2, so a fit worse than
# the mean-only model yields a negative value instead of NaN.
R_sq <- 1 - sum((pred_ini.fit - testSet$y)^2) /
        sum((testSet$y - mean(testSet$y))^2)
R_val <- sign(R_sq) * sqrt(abs(R_sq))
R_val        
## [1] 0.02471166

This R_val is not generalizable: it depends on the random seed used for the partition. To obtain a more accurate estimate of the R-value, we need to try 50 or more random seeds and average the resulting R_val. For now this is just a first glance at the R_val; we will re-calculate it once the other parameters have been settled.

Calculate the train_error and test_error for ini.fit and plot these two errors versus the number of observations used. This gives an idea of whether the model suffers from high bias or high variance.

set.seed(2017-01-10)
# Learning curve: fit on growing subsets of the training data and track the
# training and cross-validation errors at each size.
sizes <- c(5000, 10000, 25000, 50000, 75000, 100000)
R_val <- numeric(length(sizes))
J_train <- numeric(length(sizes))
J_test <- numeric(length(sizes))
n_test <- nrow(testSet)
for (i in seq_along(sizes)) {
        obs <- sizes[i]
        A <- trainSet[1:obs, ]
        fit <- lm(y ~ ., data = A)
        # BUG FIX: the original predicted with ini.fit (the model fit on the
        # full training set) instead of `fit`, so the learning curve never
        # actually depended on the training-set size.
        pred_train <- predict(fit, newdata = A)
        pred_cv <- predict(fit, newdata = testSet)

        R_sq <- 1 - sum((pred_cv - testSet$y)^2) /
                sum((testSet$y - mean(testSet$y))^2)
        R_val[i] <- sign(R_sq) * sqrt(abs(R_sq))

        J_train[i] <- 1 / (2 * obs) * sum((pred_train - A$y)^2)
        # BUG FIX: the test error must be normalized by the (fixed) number
        # of test observations, not by the training-set size `obs`.
        J_test[i] <- 1 / (2 * n_test) * sum((pred_cv - testSet$y)^2)
}
rw <- sizes
R_val <- data.frame(R_val = R_val, rw = rw)
# Long format: one row per (size, error-type) pair for ggplot.
J_error <- data.frame(J_train = J_train, J_test = J_test, rw = rw) %>%
        gather(type, errors, -rw)
g <- ggplot(data = J_error, aes(x = rw, y = errors, color = type))
g <- g + geom_point() + geom_line()
g <- g + labs(x = "numbers of observations")
g <- g + ggtitle("Errors versus the number of observations") +
  theme(plot.title = element_text(hjust = 0.5))
g

The train_error (green) and test_error (red) curves both converge to a quite small value (~0.0001). This means the model suffers from neither high bias nor high variance. Since the curves intersect at about 75,000 observations, in future tests we can use 75,000 observations for quick model filtering.

Filter the variables to optimize the modeling

Not all of the variables are necessary for a good model. Redundant variables often contribute nothing to accuracy while slowing down the computation.

set.seed(2017-01-10)
# Stepwise (AIC) variable selection on a 75,000-row subsample — the size at
# which the learning curves converged — to keep step() tractable.
Set <- Train[sample(nrow(Train), 75000), ]
inifit <- lm(y ~ ., data = Set)
#do.call(anova,lm.ls)
bestfit <- step(inifit, direction = "both")
bestfit$call
## lm(formula = y ~ fundamental_5 + fundamental_7 + fundamental_14 + 
##     fundamental_17 + fundamental_19 + fundamental_20 + fundamental_21 + 
##     fundamental_31 + fundamental_33 + fundamental_36 + fundamental_40 + 
##     fundamental_44 + fundamental_46 + fundamental_47 + fundamental_49 + 
##     fundamental_63 + technical_0 + technical_7 + technical_11 + 
##     technical_16 + technical_20 + technical_21 + technical_22 + 
##     technical_27 + technical_30 + technical_32 + technical_36 + 
##     technical_37 + technical_44, data = Set)
# Refit the selected formula on the full training split.
fit_final <- lm(formula = y ~ fundamental_5 + fundamental_7 + fundamental_14 + 
     fundamental_17 + fundamental_19 + fundamental_20 + fundamental_21 + 
     fundamental_31 + fundamental_33 + fundamental_36 + fundamental_40 + 
     fundamental_44 + fundamental_46 + fundamental_47 + fundamental_49 + 
     fundamental_63 + technical_0 + technical_7 + technical_11 + 
     technical_16 + technical_20 + technical_21 + technical_22 + 
     technical_27 + technical_30 + technical_32 + technical_36 + 
     technical_37 + technical_44, data = trainSet)
pred.train <- predict(fit_final, newdata = trainSet)
pred.cv <- predict(fit_final, newdata = testSet)
# BUG FIX: the original divided both errors by `obs`, a stale variable left
# over from the learning-curve loop (last value 100000). Each error must be
# normalized by the number of observations in its own data set.
J.train <- 1 / (2 * nrow(trainSet)) * sum((pred.train - trainSet$y)^2)
J.test <- 1 / (2 * nrow(testSet)) * sum((pred.cv - testSet$y)^2)
sprintf("The train_error is %.5f", J.train)
## [1] "The train_error is 0.00022"
sprintf("The test_error is %.5f", J.test)
## [1] "The test_error is 0.00009"

Both the train_error and test_error are close to a small value of 0.0001. This indicates that the model does NOT undergo high bias or high variance.

Calculate the R_val to grade the linear model

To get closer to the true value and reduce the random effect of the data-set partition, we repeat the partition 100 times to obtain 100 different train.Set/test.Set pairs. The R value is then estimated by averaging over these repetitions.

# Modeling fit: estimate the R value over 100 random 70/30 partitions to
# average out the randomness of any single split.
R.val <- rep(0, 100)

for (i in seq_len(100)) {
        set.seed(i)
        inTrain <- createDataPartition(Train$y, p = 0.7, list = FALSE)
        train.Set <- Train[inTrain, ]
        test.Set <- Train[-inTrain, ]
        fit_ <- lm(y ~ fundamental_5 + fundamental_7 + fundamental_14 + 
     fundamental_17 + fundamental_19 + fundamental_20 + fundamental_21 + 
     fundamental_31 + fundamental_33 + fundamental_36 + fundamental_40 + 
     fundamental_44 + fundamental_46 + fundamental_47 + fundamental_49 + 
     fundamental_63 + technical_0 + technical_7 + technical_11 + 
     technical_16 + technical_20 + technical_21 + technical_22 + 
     technical_27 + technical_30 + technical_32 + technical_36 + 
     technical_37 + technical_44, data = train.Set, na.action = na.exclude)

        # Predict y on the test set to compare with the observed y.
        pred.lm <- predict(fit_, newdata = test.Set)

        # Signed square root of the out-of-sample R^2.
        R.sq <- 1 - sum((pred.lm - test.Set$y)^2) /
                sum((test.Set$y - mean(test.Set$y))^2)
        R.val[i] <- sign(R.sq) * sqrt(abs(R.sq))

}
# BUG FIX: the original used sapply(R.val, function(x) mean(sample(x, 10,
# replace = TRUE))), which applies sample() to each SCALAR R value — for a
# length-1 numeric x, sample(x, ...) samples from 1:x and fails for x < 1.
# The intent is a bootstrap of the mean, so resample from the whole vector.
df <- replicate(100, mean(sample(R.val, 10, replace = TRUE)))
df <- round(df, 4)
# Report to 4 decimals: round(mean(df)) with no digits argument truncated
# the small mean R value (~0.02) to 0.
round(mean(df), 4)
hist(df, breaks = 10)